{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "0c3cf197",
   "metadata": {},
   "source": [
    "## Sample pipeline\n",
    "This is a sample pipeline drawn from a competition posted on [Kaggle](https://www.kaggle.com/ragharamya/loanprediction). A preprocessor followed by exploring multiple options in parallel is demonstrated below."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "described-lover",
   "metadata": {},
   "outputs": [],
   "source": [
    "%config IPCompleter.use_jedi = False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "simplified-summit",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Gender                object\n",
       "Married               object\n",
       "Dependents            object\n",
       "Education             object\n",
       "Self_Employed         object\n",
       "ApplicantIncome        int64\n",
       "CoapplicantIncome    float64\n",
       "LoanAmount           float64\n",
       "Loan_Amount_Term     float64\n",
       "Credit_History       float64\n",
       "Property_Area         object\n",
       "Loan_Status           object\n",
       "dtype: object"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "train = pd.read_csv('../resources/data/train_ctrUa4K.csv')\n",
    "test = pd.read_csv('../resources/data/test_lAUu6dG.csv')\n",
    "train = train.drop('Loan_ID', axis=1)\n",
    "train.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "confident-union",
   "metadata": {},
   "outputs": [],
   "source": [
    "# prepare the dataset for training\n",
    "\n",
    "X = train.drop('Loan_Status', axis=1)\n",
    "y = train['Loan_Status']\n",
    "from sklearn.model_selection import train_test_split\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5f1793c1",
   "metadata": {},
   "source": [
    "## SKLearn pipeline\n",
    "Below, we show how SKLearn is used to create a pipeline and then fit for each of the pipelines to explore multiple models."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "visible-compact",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.impute import SimpleImputer\n",
    "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
    "\n",
    "imputer = SimpleImputer(strategy='median')\n",
    "scaler = StandardScaler()\n",
    "\n",
    "numeric_transformer = Pipeline(steps=[\n",
    "    ('imputer', imputer),\n",
    "    ('scaler', scaler)])\n",
    "\n",
    "cat_imputer = SimpleImputer(strategy='constant', fill_value='missing')\n",
    "cat_onehot = OneHotEncoder(handle_unknown='ignore')\n",
    "\n",
    "categorical_transformer = Pipeline(steps=[\n",
    "    ('imputer', cat_imputer),\n",
    "    ('onehot', cat_onehot)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "stupid-miracle",
   "metadata": {},
   "outputs": [],
   "source": [
    "numeric_features = train.select_dtypes(include=['int64', 'float64']).columns\n",
    "categorical_features = train.select_dtypes(include=['object']).drop(['Loan_Status'], axis=1).columns\n",
    "from sklearn.compose import ColumnTransformer\n",
    "preprocessor = ColumnTransformer(\n",
    "    transformers=[\n",
    "        ('num', numeric_transformer, numeric_features),\n",
    "        ('cat', categorical_transformer, categorical_features)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "fundamental-builder",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "rf = Pipeline(steps=[('preprocessor', preprocessor),\n",
    "                      ('classifier', RandomForestClassifier())])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "above-masters",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import accuracy_score, log_loss\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from sklearn.svm import SVC, LinearSVC, NuSVC\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier\n",
    "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
    "from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis\n",
    "classifiers = [\n",
    "    KNeighborsClassifier(3),\n",
    "    SVC(kernel=\"rbf\", C=0.025, probability=True),\n",
    "    NuSVC(probability=True),\n",
    "    RandomForestClassifier(),\n",
    "    GradientBoostingClassifier()\n",
    "    ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "greatest-cancellation",
   "metadata": {},
   "outputs": [],
   "source": [
    "classifier_results=[]\n",
    "for classifier in classifiers:\n",
    "    pipe = Pipeline(steps=[('preprocessor', preprocessor),\n",
    "                      ('classifier', classifier)])\n",
    "    pipe.fit(X_train, y_train)\n",
    "    pipe.predict(X_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fef872c4",
   "metadata": {},
   "source": [
    "## CodeFlare pipelines\n",
    "Below, we show how this can be done with CodeFlare pipelines approach."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "coordinate-gossip",
   "metadata": {},
   "outputs": [],
   "source": [
    "import ray\n",
    "ray.shutdown()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "bfe20bd8",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2021-06-20 11:53:04,117\tINFO services.py:1269 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'node_ip_address': '9.211.53.245',\n",
       " 'raylet_ip_address': '9.211.53.245',\n",
       " 'redis_address': '9.211.53.245:6379',\n",
       " 'object_store_address': '/tmp/ray/session_2021-06-20_11-53-02_767779_97928/sockets/plasma_store',\n",
       " 'raylet_socket_name': '/tmp/ray/session_2021-06-20_11-53-02_767779_97928/sockets/raylet',\n",
       " 'webui_url': '127.0.0.1:8265',\n",
       " 'session_dir': '/tmp/ray/session_2021-06-20_11-53-02_767779_97928',\n",
       " 'metrics_export_port': 64339,\n",
       " 'node_id': '8c45750634c387b09b787fb7a0aa191df63f6d5274861bad27c00cb3'}"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ray.init()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "invisible-consensus",
   "metadata": {},
   "outputs": [],
   "source": [
    "import codeflare.pipelines.Datamodel as dm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "surface-recruitment",
   "metadata": {},
   "outputs": [],
   "source": [
    "pipeline = dm.Pipeline()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "humanitarian-boards",
   "metadata": {},
   "outputs": [],
   "source": [
    "node_a = dm.EstimatorNode('preprocess', preprocessor)\n",
    "node_0 = dm.EstimatorNode('node_0', classifiers[0])\n",
    "node_1 = dm.EstimatorNode('node_1', classifiers[1])\n",
    "node_2 = dm.EstimatorNode('node_2', classifiers[2])\n",
    "node_3 = dm.EstimatorNode('node_3', classifiers[3])\n",
    "node_4 = dm.EstimatorNode('node_4', classifiers[4])\n",
    "\n",
    "pipeline.add_edge(node_a, node_0)\n",
    "pipeline.add_edge(node_a, node_1)\n",
    "pipeline.add_edge(node_a, node_2)\n",
    "pipeline.add_edge(node_a, node_3)\n",
    "pipeline.add_edge(node_a, node_4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "ef9ff37b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# create input\n",
    "pipeline_input = dm.PipelineInput()\n",
    "xy = dm.Xy(X_train, y_train)\n",
    "pipeline_input.add_xy_arg(node_a, xy)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "mineral-analyst",
   "metadata": {},
   "outputs": [],
   "source": [
    "import codeflare.pipelines.Runtime as rt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "literary-consolidation",
   "metadata": {},
   "outputs": [],
   "source": [
    "from codeflare.pipelines.Runtime import ExecutionType"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "educated-basement",
   "metadata": {},
   "outputs": [],
   "source": [
    "pipeline_output = rt.execute_pipeline(pipeline, ExecutionType.FIT, pipeline_input)\n",
    "node_0_output = pipeline_output.get_xyrefs(node_0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "4df7fb29",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<codeflare.pipelines.Datamodel.XYRef at 0x7f97087200d0>]"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "outputs[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "41955886",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_out = ray.get(outputs[0][0].get_Xref())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "7e05451f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['Y', 'N', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',\n",
       "       'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y',\n",
       "       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',\n",
       "       'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N',\n",
       "       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y',\n",
       "       'Y', 'Y', 'Y', 'N', 'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'N', 'Y',\n",
       "       'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'N', 'N', 'Y', 'N', 'N',\n",
       "       'N', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'N',\n",
       "       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'Y',\n",
       "       'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',\n",
       "       'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'Y',\n",
       "       'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',\n",
       "       'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',\n",
       "       'N', 'N', 'Y', 'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y',\n",
       "       'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',\n",
       "       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'N', 'Y', 'Y', 'N', 'Y',\n",
       "       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',\n",
       "       'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'N', 'Y',\n",
       "       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',\n",
       "       'N', 'N', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',\n",
       "       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N',\n",
       "       'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y',\n",
       "       'N', 'Y', 'Y', 'N', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',\n",
       "       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',\n",
       "       'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',\n",
       "       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',\n",
       "       'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y',\n",
       "       'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',\n",
       "       'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y',\n",
       "       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',\n",
       "       'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',\n",
       "       'Y', 'N', 'Y', 'Y', 'Y', 'N', 'N', 'N', 'Y', 'Y', 'N', 'Y', 'Y',\n",
       "       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N',\n",
       "       'N', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',\n",
       "       'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',\n",
       "       'Y', 'Y', 'Y', 'N', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',\n",
       "       'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',\n",
       "       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y'], dtype=object)"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_out"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
